notebooks/Analyze Clusters.ipynb

{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "80acaea7-0372-46d6-8a52-75da477950a2", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 1, "id": "3578115f-c631-4871-8f66-ee7ce9669f88", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "ccbca466-e39a-4223-9fa6-eadba444736c", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"full.csv\", index_col=False)" ] }, { "cell_type": "code", "execution_count": 3, "id": "555c60e1-7a27-4feb-95ad-9ea52af16aad", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>accuracy</th>\n", " <th>kappa</th>\n", " <th>testName</th>\n", " <th>anchorMethod</th>\n", " <th>numAnchors</th>\n", " <th>silBoost</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.750000</td>\n", " <td>0.152318</td>\n", " <td>emily_embeddings.tsv</td>\n", " <td>FIXED</td>\n", " <td>1</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0.750000</td>\n", " <td>0.152318</td>\n", " <td>emily_embeddings.tsv</td>\n", " <td>DRIFT</td>\n", " <td>1</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.750000</td>\n", " <td>0.152318</td>\n", " <td>emily_embeddings.tsv</td>\n", " <td>FIXED</td>\n", " <td>1</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.750000</td>\n", " <td>0.152318</td>\n", " <td>emily_embeddings.tsv</td>\n", " <td>DRIFT</td>\n", " <td>1</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.500000</td>\n", " 
<td>0.055351</td>\n", " <td>emily_embeddings.tsv</td>\n", " <td>FIXED</td>\n", " <td>2</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>4127</th>\n", " <td>0.787234</td>\n", " <td>0.132841</td>\n", " <td>greg_embeddings.tsv</td>\n", " <td>DRIFT</td>\n", " <td>2</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>4128</th>\n", " <td>0.787234</td>\n", " <td>0.132841</td>\n", " <td>greg_embeddings.tsv</td>\n", " <td>FIXED</td>\n", " <td>3</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>4129</th>\n", " <td>0.744681</td>\n", " <td>0.107595</td>\n", " <td>greg_embeddings.tsv</td>\n", " <td>DRIFT</td>\n", " <td>3</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>4130</th>\n", " <td>0.787234</td>\n", " <td>0.132841</td>\n", " <td>greg_embeddings.tsv</td>\n", " <td>FIXED</td>\n", " <td>3</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>4131</th>\n", " <td>0.744681</td>\n", " <td>0.107595</td>\n", " <td>greg_embeddings.tsv</td>\n", " <td>DRIFT</td>\n", " <td>3</td>\n", " <td>1.4</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>4132 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " accuracy kappa testName anchorMethod numAnchors \\\n", "0 0.750000 0.152318 emily_embeddings.tsv FIXED 1 \n", "1 0.750000 0.152318 emily_embeddings.tsv DRIFT 1 \n", "2 0.750000 0.152318 emily_embeddings.tsv FIXED 1 \n", "3 0.750000 0.152318 emily_embeddings.tsv DRIFT 1 \n", "4 0.500000 0.055351 emily_embeddings.tsv FIXED 2 \n", "... ... ... ... ... ... 
\n", "4127 0.787234 0.132841 greg_embeddings.tsv DRIFT 2 \n", "4128 0.787234 0.132841 greg_embeddings.tsv FIXED 3 \n", "4129 0.744681 0.107595 greg_embeddings.tsv DRIFT 3 \n", "4130 0.787234 0.132841 greg_embeddings.tsv FIXED 3 \n", "4131 0.744681 0.107595 greg_embeddings.tsv DRIFT 3 \n", "\n", " silBoost \n", "0 0.0 \n", "1 0.0 \n", "2 1.4 \n", "3 1.4 \n", "4 0.0 \n", "... ... \n", "4127 1.4 \n", "4128 0.0 \n", "4129 0.0 \n", "4130 1.4 \n", "4131 1.4 \n", "\n", "[4132 rows x 6 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "id": "c49b3dd5-f8e9-43bb-b6ec-64e4c737186b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_90551/1687026169.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n", " df[df.numAnchors == 3].groupby([\"anchorMethod\"]).mean()\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>accuracy</th>\n", " <th>kappa</th>\n", " <th>numAnchors</th>\n", " <th>silBoost</th>\n", " </tr>\n", " <tr>\n", " <th>anchorMethod</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>DRIFT</th>\n", " <td>0.709526</td>\n", " <td>0.075073</td>\n", " <td>3.0</td>\n", " <td>0.698984</td>\n", " </tr>\n", " <tr>\n", " <th>FIXED</th>\n", " 
<td>0.802391</td>\n", " <td>0.116437</td>\n", " <td>3.0</td>\n", " <td>0.698984</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " accuracy kappa numAnchors silBoost\n", "anchorMethod \n", "DRIFT 0.709526 0.075073 3.0 0.698984\n", "FIXED 0.802391 0.116437 3.0 0.698984" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.numAnchors == 3].groupby([\"anchorMethod\"]).mean(numeric_only=True)" ] }, { "cell_type": "code", "execution_count": 5, "id": "cf497610-2df6-4311-9b14-4150fe128870", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_90551/4054225228.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n", " df[df.numAnchors == 1].groupby([\"anchorMethod\"]).mean()\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>accuracy</th>\n", " <th>kappa</th>\n", " <th>numAnchors</th>\n", " <th>silBoost</th>\n", " </tr>\n", " <tr>\n", " <th>anchorMethod</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>DRIFT</th>\n", " <td>0.841783</td>\n", " <td>0.133174</td>\n", " <td>1.0</td>\n", " <td>0.7</td>\n", " </tr>\n", " <tr>\n", " <th>FIXED</th>\n", " <td>0.856031</td>\n", " <td>0.150774</td>\n", " <td>1.0</td>\n", " <td>0.7</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ 
" accuracy kappa numAnchors silBoost\n", "anchorMethod \n", "DRIFT 0.841783 0.133174 1.0 0.7\n", "FIXED 0.856031 0.150774 1.0 0.7" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.numAnchors == 1].groupby([\"anchorMethod\"]).mean(numeric_only=True)" ] }, { "cell_type": "code", "execution_count": 6, "id": "85af2aae-af00-4207-9c70-48c850e90f4c", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_90551/3579786468.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n", " df.groupby([\"anchorMethod\"]).mean()\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>accuracy</th>\n", " <th>kappa</th>\n", " <th>numAnchors</th>\n", " <th>silBoost</th>\n", " </tr>\n", " <tr>\n", " <th>anchorMethod</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>DRIFT</th>\n", " <td>0.761772</td>\n", " <td>0.094499</td>\n", " <td>2.000484</td>\n", " <td>0.7</td>\n", " </tr>\n", " <tr>\n", " <th>FIXED</th>\n", " <td>0.819637</td>\n", " <td>0.130391</td>\n", " <td>2.000484</td>\n", " <td>0.7</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " accuracy kappa numAnchors silBoost\n", "anchorMethod \n", "DRIFT 0.761772 0.094499 2.000484 0.7\n", "FIXED 0.819637 0.130391 2.000484 0.7" ] }, 
"execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"anchorMethod\"]).mean(numeric_only=True)" ] }, { "cell_type": "code", "execution_count": 13, "id": "bdf3535e-f706-48d6-b5c0-1dd5a12f4e34", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/Rrando/Documents/GitHub/content-ml-experiments/tab_grouping/analysis\n" ] } ], "source": [ "!pwd" ] }, { "cell_type": "code", "execution_count": 5, "id": "6254f6bc-4eaa-4635-a953-6d6f363d4095", "metadata": {}, "outputs": [], "source": [ "all_pipeline = pd.read_csv(\"./all_pipeline.csv\", index_col=False)" ] }, { "cell_type": "code", "execution_count": 6, "id": "4c6ddd81-032f-41ff-9cda-d4e989b50257", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_31550/2563808502.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. 
Either specify numeric_only or select only columns which should be valid for the function.\n", " all_pipeline.groupby([\"clustering_method\", \"tf_idf_scale\", \"remap\", \"dbscan_eps\"]).mean().sort_values([\"adj_rand\"], ascending=False)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th>Unnamed: 0</th>\n", " <th>history_scale</th>\n", " <th>domain_scale</th>\n", " <th>title_embedding_scale</th>\n", " <th>rand</th>\n", " <th>adj_rand</th>\n", " </tr>\n", " <tr>\n", " <th>clustering_method</th>\n", " <th>tf_idf_scale</th>\n", " <th>remap</th>\n", " <th>dbscan_eps</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"8\" valign=\"top\">dbscan</th>\n", " <th rowspan=\"2\" valign=\"top\">0.3</th>\n", " <th rowspan=\"2\" valign=\"top\">5</th>\n", " <th>0.3</th>\n", " <td>1339.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.800872</td>\n", " <td>0.364023</td>\n", " </tr>\n", " <tr>\n", " <th>0.4</th>\n", " <td>1351.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.794772</td>\n", " <td>0.359055</td>\n", " </tr>\n", " <tr>\n", " <th>0.2</th>\n", " <th>5</th>\n", " <th>0.5</th>\n", " <td>1362.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.791034</td>\n", " <td>0.354760</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <th>5</th>\n", " <th>0.5</th>\n", " <td>1363.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.799238</td>\n", " 
<td>0.353957</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">0.0</th>\n", " <th rowspan=\"2\" valign=\"top\">5</th>\n", " <th>0.3</th>\n", " <td>1336.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.793770</td>\n", " <td>0.351563</td>\n", " </tr>\n", " <tr>\n", " <th>0.4</th>\n", " <td>1348.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.794378</td>\n", " <td>0.347898</td>\n", " </tr>\n", " <tr>\n", " <th>0.2</th>\n", " <th>5</th>\n", " <th>0.3</th>\n", " <td>1338.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.789821</td>\n", " <td>0.345792</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <th>5</th>\n", " <th>0.5</th>\n", " <td>1361.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.779388</td>\n", " <td>0.340374</td>\n", " </tr>\n", " <tr>\n", " <th>kmeans</th>\n", " <th>0.3</th>\n", " <th>15</th>\n", " <th>0.4</th>\n", " <td>1331.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.743813</td>\n", " <td>0.338441</td>\n", " </tr>\n", " <tr>\n", " <th>dbscan</th>\n", " <th>0.1</th>\n", " <th>5</th>\n", " <th>0.3</th>\n", " <td>1337.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.785502</td>\n", " <td>0.337431</td>\n", " </tr>\n", " <tr>\n", " <th>kmeans</th>\n", " <th>0.0</th>\n", " <th>15</th>\n", " <th>0.4</th>\n", " <td>1328.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.745420</td>\n", " <td>0.337181</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">dbscan</th>\n", " <th>0.0</th>\n", " <th>5</th>\n", " <th>0.5</th>\n", " <td>1360.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.785785</td>\n", " <td>0.333402</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <th>5</th>\n", " <th>0.4</th>\n", " <td>1349.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.782962</td>\n", " 
<td>0.331806</td>\n", " </tr>\n", " <tr>\n", " <th>0.2</th>\n", " <th>5</th>\n", " <th>0.4</th>\n", " <td>1350.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.785774</td>\n", " <td>0.330820</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"5\" valign=\"top\">kmeans</th>\n", " <th rowspan=\"2\" valign=\"top\">0.2</th>\n", " <th>5</th>\n", " <th>0.4</th>\n", " <td>1326.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.732529</td>\n", " <td>0.330662</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <th>0.4</th>\n", " <td>1330.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.738883</td>\n", " <td>0.325336</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <th>15</th>\n", " <th>0.4</th>\n", " <td>1329.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.742208</td>\n", " <td>0.321906</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <th>5</th>\n", " <th>0.4</th>\n", " <td>1327.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.726770</td>\n", " <td>0.318727</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <th>5</th>\n", " <th>0.4</th>\n", " <td>1325.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.724333</td>\n", " <td>0.312570</td>\n", " </tr>\n", " <tr>\n", " <th>dbscan</th>\n", " <th>0.3</th>\n", " <th>15</th>\n", " <th>0.4</th>\n", " <td>1355.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.740527</td>\n", " <td>0.301732</td>\n", " </tr>\n", " <tr>\n", " <th>kmeans</th>\n", " <th>0.0</th>\n", " <th>5</th>\n", " <th>0.4</th>\n", " <td>1324.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.713306</td>\n", " <td>0.299799</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"11\" valign=\"top\">dbscan</th>\n", " <th rowspan=\"2\" valign=\"top\">0.2</th>\n", " <th rowspan=\"2\" valign=\"top\">15</th>\n", " <th>0.3</th>\n", " <td>1342.0</td>\n", " <td>0.0</td>\n", 
" <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.739612</td>\n", " <td>0.294205</td>\n", " </tr>\n", " <tr>\n", " <th>0.4</th>\n", " <td>1354.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.734766</td>\n", " <td>0.289947</td>\n", " </tr>\n", " <tr>\n", " <th>0.0</th>\n", " <th>15</th>\n", " <th>0.5</th>\n", " <td>1364.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.723007</td>\n", " <td>0.288268</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">0.1</th>\n", " <th rowspan=\"2\" valign=\"top\">15</th>\n", " <th>0.5</th>\n", " <td>1365.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.729160</td>\n", " <td>0.287763</td>\n", " </tr>\n", " <tr>\n", " <th>0.4</th>\n", " <td>1353.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.727874</td>\n", " <td>0.287064</td>\n", " </tr>\n", " <tr>\n", " <th>0.0</th>\n", " <th>15</th>\n", " <th>0.4</th>\n", " <td>1352.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.721996</td>\n", " <td>0.285680</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <th>15</th>\n", " <th>0.5</th>\n", " <td>1367.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.730880</td>\n", " <td>0.285333</td>\n", " </tr>\n", " <tr>\n", " <th>0.0</th>\n", " <th>15</th>\n", " <th>0.3</th>\n", " <td>1340.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.720039</td>\n", " <td>0.283822</td>\n", " </tr>\n", " <tr>\n", " <th>0.2</th>\n", " <th>15</th>\n", " <th>0.5</th>\n", " <td>1366.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.735079</td>\n", " <td>0.283759</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <th>15</th>\n", " <th>0.3</th>\n", " <td>1341.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.732180</td>\n", " <td>0.279688</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <th>15</th>\n", " 
<th>0.3</th>\n", " <td>1343.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.728520</td>\n", " <td>0.279203</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"4\" valign=\"top\">kmeans</th>\n", " <th>0.2</th>\n", " <th>0</th>\n", " <th>0.4</th>\n", " <td>1322.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.735445</td>\n", " <td>0.277878</td>\n", " </tr>\n", " <tr>\n", " <th>0.0</th>\n", " <th>0</th>\n", " <th>0.4</th>\n", " <td>1320.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.730139</td>\n", " <td>0.277629</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <th>0</th>\n", " <th>0.4</th>\n", " <td>1323.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.727076</td>\n", " <td>0.276215</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <th>0</th>\n", " <th>0.4</th>\n", " <td>1321.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.727747</td>\n", " <td>0.271124</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"12\" valign=\"top\">dbscan</th>\n", " <th rowspan=\"2\" valign=\"top\">0.0</th>\n", " <th rowspan=\"2\" valign=\"top\">0</th>\n", " <th>0.5</th>\n", " <td>1356.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.689179</td>\n", " <td>0.256568</td>\n", " </tr>\n", " <tr>\n", " <th>0.4</th>\n", " <td>1344.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.687786</td>\n", " <td>0.255828</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">0.1</th>\n", " <th rowspan=\"3\" valign=\"top\">0</th>\n", " <th>0.4</th>\n", " <td>1345.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.686185</td>\n", " <td>0.255206</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <td>1333.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.685546</td>\n", " <td>0.254895</td>\n", " </tr>\n", " <tr>\n", " <th>0.5</th>\n", " <td>1357.0</td>\n", " 
<td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.687280</td>\n", " <td>0.254833</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <th>0</th>\n", " <th>0.3</th>\n", " <td>1335.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.681074</td>\n", " <td>0.252954</td>\n", " </tr>\n", " <tr>\n", " <th>0.0</th>\n", " <th>0</th>\n", " <th>0.3</th>\n", " <td>1332.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.684678</td>\n", " <td>0.252189</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <th>0</th>\n", " <th>0.4</th>\n", " <td>1347.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.676580</td>\n", " <td>0.249847</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">0.2</th>\n", " <th rowspan=\"3\" valign=\"top\">0</th>\n", " <th>0.3</th>\n", " <td>1334.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.681268</td>\n", " <td>0.249763</td>\n", " </tr>\n", " <tr>\n", " <th>0.5</th>\n", " <td>1358.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.684657</td>\n", " <td>0.249597</td>\n", " </tr>\n", " <tr>\n", " <th>0.4</th>\n", " <td>1346.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.681162</td>\n", " <td>0.249232</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <th>0</th>\n", " <th>0.5</th>\n", " <td>1359.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.671968</td>\n", " <td>0.245342</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Unnamed: 0 history_scale \\\n", "clustering_method tf_idf_scale remap dbscan_eps \n", "dbscan 0.3 5 0.3 1339.0 0.0 \n", " 0.4 1351.0 0.0 \n", " 0.2 5 0.5 1362.0 0.0 \n", " 0.3 5 0.5 1363.0 0.0 \n", " 0.0 5 0.3 1336.0 0.0 \n", " 0.4 1348.0 0.0 \n", " 0.2 5 0.3 1338.0 0.0 \n", " 0.1 5 0.5 1361.0 0.0 \n", "kmeans 0.3 15 0.4 1331.0 0.0 \n", "dbscan 0.1 5 0.3 1337.0 0.0 \n", "kmeans 0.0 15 0.4 1328.0 0.0 
\n", "dbscan 0.0 5 0.5 1360.0 0.0 \n", " 0.1 5 0.4 1349.0 0.0 \n", " 0.2 5 0.4 1350.0 0.0 \n", "kmeans 0.2 5 0.4 1326.0 0.0 \n", " 15 0.4 1330.0 0.0 \n", " 0.1 15 0.4 1329.0 0.0 \n", " 0.3 5 0.4 1327.0 0.0 \n", " 0.1 5 0.4 1325.0 0.0 \n", "dbscan 0.3 15 0.4 1355.0 0.0 \n", "kmeans 0.0 5 0.4 1324.0 0.0 \n", "dbscan 0.2 15 0.3 1342.0 0.0 \n", " 0.4 1354.0 0.0 \n", " 0.0 15 0.5 1364.0 0.0 \n", " 0.1 15 0.5 1365.0 0.0 \n", " 0.4 1353.0 0.0 \n", " 0.0 15 0.4 1352.0 0.0 \n", " 0.3 15 0.5 1367.0 0.0 \n", " 0.0 15 0.3 1340.0 0.0 \n", " 0.2 15 0.5 1366.0 0.0 \n", " 0.1 15 0.3 1341.0 0.0 \n", " 0.3 15 0.3 1343.0 0.0 \n", "kmeans 0.2 0 0.4 1322.0 0.0 \n", " 0.0 0 0.4 1320.0 0.0 \n", " 0.3 0 0.4 1323.0 0.0 \n", " 0.1 0 0.4 1321.0 0.0 \n", "dbscan 0.0 0 0.5 1356.0 0.0 \n", " 0.4 1344.0 0.0 \n", " 0.1 0 0.4 1345.0 0.0 \n", " 0.3 1333.0 0.0 \n", " 0.5 1357.0 0.0 \n", " 0.3 0 0.3 1335.0 0.0 \n", " 0.0 0 0.3 1332.0 0.0 \n", " 0.3 0 0.4 1347.0 0.0 \n", " 0.2 0 0.3 1334.0 0.0 \n", " 0.5 1358.0 0.0 \n", " 0.4 1346.0 0.0 \n", " 0.3 0 0.5 1359.0 0.0 \n", "\n", " domain_scale \\\n", "clustering_method tf_idf_scale remap dbscan_eps \n", "dbscan 0.3 5 0.3 0.0 \n", " 0.4 0.0 \n", " 0.2 5 0.5 0.0 \n", " 0.3 5 0.5 0.0 \n", " 0.0 5 0.3 0.0 \n", " 0.4 0.0 \n", " 0.2 5 0.3 0.0 \n", " 0.1 5 0.5 0.0 \n", "kmeans 0.3 15 0.4 0.0 \n", "dbscan 0.1 5 0.3 0.0 \n", "kmeans 0.0 15 0.4 0.0 \n", "dbscan 0.0 5 0.5 0.0 \n", " 0.1 5 0.4 0.0 \n", " 0.2 5 0.4 0.0 \n", "kmeans 0.2 5 0.4 0.0 \n", " 15 0.4 0.0 \n", " 0.1 15 0.4 0.0 \n", " 0.3 5 0.4 0.0 \n", " 0.1 5 0.4 0.0 \n", "dbscan 0.3 15 0.4 0.0 \n", "kmeans 0.0 5 0.4 0.0 \n", "dbscan 0.2 15 0.3 0.0 \n", " 0.4 0.0 \n", " 0.0 15 0.5 0.0 \n", " 0.1 15 0.5 0.0 \n", " 0.4 0.0 \n", " 0.0 15 0.4 0.0 \n", " 0.3 15 0.5 0.0 \n", " 0.0 15 0.3 0.0 \n", " 0.2 15 0.5 0.0 \n", " 0.1 15 0.3 0.0 \n", " 0.3 15 0.3 0.0 \n", "kmeans 0.2 0 0.4 0.0 \n", " 0.0 0 0.4 0.0 \n", " 0.3 0 0.4 0.0 \n", " 0.1 0 0.4 0.0 \n", "dbscan 0.0 0 0.5 0.0 \n", " 0.4 0.0 \n", " 0.1 0 0.4 0.0 \n", " 
0.3 0.0 \n", " 0.5 0.0 \n", " 0.3 0 0.3 0.0 \n", " 0.0 0 0.3 0.0 \n", " 0.3 0 0.4 0.0 \n", " 0.2 0 0.3 0.0 \n", " 0.5 0.0 \n", " 0.4 0.0 \n", " 0.3 0 0.5 0.0 \n", "\n", " title_embedding_scale \\\n", "clustering_method tf_idf_scale remap dbscan_eps \n", "dbscan 0.3 5 0.3 1.0 \n", " 0.4 1.0 \n", " 0.2 5 0.5 1.0 \n", " 0.3 5 0.5 1.0 \n", " 0.0 5 0.3 1.0 \n", " 0.4 1.0 \n", " 0.2 5 0.3 1.0 \n", " 0.1 5 0.5 1.0 \n", "kmeans 0.3 15 0.4 1.0 \n", "dbscan 0.1 5 0.3 1.0 \n", "kmeans 0.0 15 0.4 1.0 \n", "dbscan 0.0 5 0.5 1.0 \n", " 0.1 5 0.4 1.0 \n", " 0.2 5 0.4 1.0 \n", "kmeans 0.2 5 0.4 1.0 \n", " 15 0.4 1.0 \n", " 0.1 15 0.4 1.0 \n", " 0.3 5 0.4 1.0 \n", " 0.1 5 0.4 1.0 \n", "dbscan 0.3 15 0.4 1.0 \n", "kmeans 0.0 5 0.4 1.0 \n", "dbscan 0.2 15 0.3 1.0 \n", " 0.4 1.0 \n", " 0.0 15 0.5 1.0 \n", " 0.1 15 0.5 1.0 \n", " 0.4 1.0 \n", " 0.0 15 0.4 1.0 \n", " 0.3 15 0.5 1.0 \n", " 0.0 15 0.3 1.0 \n", " 0.2 15 0.5 1.0 \n", " 0.1 15 0.3 1.0 \n", " 0.3 15 0.3 1.0 \n", "kmeans 0.2 0 0.4 1.0 \n", " 0.0 0 0.4 1.0 \n", " 0.3 0 0.4 1.0 \n", " 0.1 0 0.4 1.0 \n", "dbscan 0.0 0 0.5 1.0 \n", " 0.4 1.0 \n", " 0.1 0 0.4 1.0 \n", " 0.3 1.0 \n", " 0.5 1.0 \n", " 0.3 0 0.3 1.0 \n", " 0.0 0 0.3 1.0 \n", " 0.3 0 0.4 1.0 \n", " 0.2 0 0.3 1.0 \n", " 0.5 1.0 \n", " 0.4 1.0 \n", " 0.3 0 0.5 1.0 \n", "\n", " rand adj_rand \n", "clustering_method tf_idf_scale remap dbscan_eps \n", "dbscan 0.3 5 0.3 0.800872 0.364023 \n", " 0.4 0.794772 0.359055 \n", " 0.2 5 0.5 0.791034 0.354760 \n", " 0.3 5 0.5 0.799238 0.353957 \n", " 0.0 5 0.3 0.793770 0.351563 \n", " 0.4 0.794378 0.347898 \n", " 0.2 5 0.3 0.789821 0.345792 \n", " 0.1 5 0.5 0.779388 0.340374 \n", "kmeans 0.3 15 0.4 0.743813 0.338441 \n", "dbscan 0.1 5 0.3 0.785502 0.337431 \n", "kmeans 0.0 15 0.4 0.745420 0.337181 \n", "dbscan 0.0 5 0.5 0.785785 0.333402 \n", " 0.1 5 0.4 0.782962 0.331806 \n", " 0.2 5 0.4 0.785774 0.330820 \n", "kmeans 0.2 5 0.4 0.732529 0.330662 \n", " 15 0.4 0.738883 0.325336 \n", " 0.1 15 0.4 0.742208 0.321906 \n", " 0.3 5 0.4 
0.726770 0.318727 \n", " 0.1 5 0.4 0.724333 0.312570 \n", "dbscan 0.3 15 0.4 0.740527 0.301732 \n", "kmeans 0.0 5 0.4 0.713306 0.299799 \n", "dbscan 0.2 15 0.3 0.739612 0.294205 \n", " 0.4 0.734766 0.289947 \n", " 0.0 15 0.5 0.723007 0.288268 \n", " 0.1 15 0.5 0.729160 0.287763 \n", " 0.4 0.727874 0.287064 \n", " 0.0 15 0.4 0.721996 0.285680 \n", " 0.3 15 0.5 0.730880 0.285333 \n", " 0.0 15 0.3 0.720039 0.283822 \n", " 0.2 15 0.5 0.735079 0.283759 \n", " 0.1 15 0.3 0.732180 0.279688 \n", " 0.3 15 0.3 0.728520 0.279203 \n", "kmeans 0.2 0 0.4 0.735445 0.277878 \n", " 0.0 0 0.4 0.730139 0.277629 \n", " 0.3 0 0.4 0.727076 0.276215 \n", " 0.1 0 0.4 0.727747 0.271124 \n", "dbscan 0.0 0 0.5 0.689179 0.256568 \n", " 0.4 0.687786 0.255828 \n", " 0.1 0 0.4 0.686185 0.255206 \n", " 0.3 0.685546 0.254895 \n", " 0.5 0.687280 0.254833 \n", " 0.3 0 0.3 0.681074 0.252954 \n", " 0.0 0 0.3 0.684678 0.252189 \n", " 0.3 0 0.4 0.676580 0.249847 \n", " 0.2 0 0.3 0.681268 0.249763 \n", " 0.5 0.684657 0.249597 \n", " 0.4 0.681162 0.249232 \n", " 0.3 0 0.5 0.671968 0.245342 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_pipeline.groupby([\"clustering_method\", \"tf_idf_scale\", \"remap\", \"dbscan_eps\"]).mean(numeric_only=True).sort_values([\"adj_rand\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": 38, "id": "13d43adf-06fb-46e5-93cb-37aa161636fb", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_90551/1993900622.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. 
Either specify numeric_only or select only columns which should be valid for the function.\n", " all_pipeline.groupby([\"clustering_method\", \"tf_idf_scale\", \"remap\", \"dbscan_eps\"]).mean().sort_values([\"adj_rand\"], ascending=False).to_csv(\"sorted.csv\")\n" ] } ], "source": [ "all_pipeline.groupby([\"clustering_method\", \"tf_idf_scale\", \"remap\", \"dbscan_eps\"]).mean(numeric_only=True).sort_values([\"adj_rand\"], ascending=False).to_csv(\"sorted.csv\")" ] }, { "cell_type": "code", "execution_count": 29, "id": "2cf92641-3c00-4638-abf8-9076a493cfb6", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_90551/2903459063.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n", " all_pipeline[(all_pipeline.clustering_method == \"dbscan\") & (all_pipeline.remap == 5)].groupby([\"dbscan_eps\"]).mean().sort_values([\"adj_rand\"], ascending=False)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Unnamed: 0</th>\n", " <th>history_scale</th>\n", " <th>domain_scale</th>\n", " <th>title_embedding_scale</th>\n", " <th>tf_idf_scale</th>\n", " <th>remap</th>\n", " <th>rand</th>\n", " <th>adj_rand</th>\n", " </tr>\n", " <tr>\n", " <th>dbscan_eps</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " 
<th>0.3</th>\n", " <td>1337.5</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.15</td>\n", " <td>5.0</td>\n", " <td>0.792492</td>\n", " <td>0.349702</td>\n", " </tr>\n", " <tr>\n", " <th>0.5</th>\n", " <td>1361.5</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.15</td>\n", " <td>5.0</td>\n", " <td>0.788861</td>\n", " <td>0.345623</td>\n", " </tr>\n", " <tr>\n", " <th>0.4</th>\n", " <td>1349.5</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.15</td>\n", " <td>5.0</td>\n", " <td>0.789472</td>\n", " <td>0.342395</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Unnamed: 0 history_scale domain_scale title_embedding_scale \\\n", "dbscan_eps \n", "0.3 1337.5 0.0 0.0 1.0 \n", "0.5 1361.5 0.0 0.0 1.0 \n", "0.4 1349.5 0.0 0.0 1.0 \n", "\n", " tf_idf_scale remap rand adj_rand \n", "dbscan_eps \n", "0.3 0.15 5.0 0.792492 0.349702 \n", "0.5 0.15 5.0 0.788861 0.345623 \n", "0.4 0.15 5.0 0.789472 0.342395 " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_pipeline[(all_pipeline.clustering_method == \"dbscan\") & (all_pipeline.remap == 5)].groupby([\"dbscan_eps\"]).mean(numeric_only=True).sort_values([\"adj_rand\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": 31, "id": "c0ec59f4-d54b-4f8b-b296-0e21b963809b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_90551/747050292.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. 
Either specify numeric_only or select only columns which should be valid for the function.\n", " all_pipeline[(all_pipeline.clustering_method == \"dbscan\") & (all_pipeline.remap == 5)].groupby([\"tf_idf_scale\"]).mean().sort_values([\"adj_rand\"], ascending=False)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Unnamed: 0</th>\n", " <th>history_scale</th>\n", " <th>domain_scale</th>\n", " <th>title_embedding_scale</th>\n", " <th>dbscan_eps</th>\n", " <th>remap</th>\n", " <th>rand</th>\n", " <th>adj_rand</th>\n", " </tr>\n", " <tr>\n", " <th>tf_idf_scale</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0.3</th>\n", " <td>1351.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.798294</td>\n", " <td>0.359011</td>\n", " </tr>\n", " <tr>\n", " <th>0.0</th>\n", " <td>1348.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.791311</td>\n", " <td>0.344288</td>\n", " </tr>\n", " <tr>\n", " <th>0.2</th>\n", " <td>1350.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.788877</td>\n", " <td>0.343791</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <td>1349.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.782618</td>\n", " <td>0.336537</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ 
" Unnamed: 0 history_scale domain_scale title_embedding_scale \\\n", "tf_idf_scale \n", "0.3 1351.0 0.0 0.0 1.0 \n", "0.0 1348.0 0.0 0.0 1.0 \n", "0.2 1350.0 0.0 0.0 1.0 \n", "0.1 1349.0 0.0 0.0 1.0 \n", "\n", " dbscan_eps remap rand adj_rand \n", "tf_idf_scale \n", "0.3 0.4 5.0 0.798294 0.359011 \n", "0.0 0.4 5.0 0.791311 0.344288 \n", "0.2 0.4 5.0 0.788877 0.343791 \n", "0.1 0.4 5.0 0.782618 0.336537 " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_pipeline[(all_pipeline.clustering_method == \"dbscan\") & (all_pipeline.remap == 5)].groupby([\"tf_idf_scale\"]).mean(numeric_only=True).sort_values([\"adj_rand\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": 37, "id": "b502bcd5-eb1f-4525-b2a0-160c821ec84c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_90551/4044017682.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " all_pipeline.drop(\"rand\", axis=1).corr()['adj_rand']\n" ] }, { "data": { "text/plain": [ "Unnamed: 0 -0.128952\n", "history_scale NaN\n", "domain_scale NaN\n", "title_embedding_scale NaN\n", "tf_idf_scale 0.009312\n", "dbscan_eps -0.001607\n", "remap 0.050316\n", "adj_rand 1.000000\n", "Name: adj_rand, dtype: float64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_pipeline.drop(\"rand\", axis=1).corr(numeric_only=True)['adj_rand']" ] }, { "cell_type": "code", "execution_count": 11, "id": "a3b88799-4ce8-4ccf-a28b-8213c8fc061b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_31550/1101770099.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. 
In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n", " all_pipeline[(all_pipeline.remap == 0) & (all_pipeline.dbscan_eps == 0.4)].groupby([\"clustering_method\", \"tf_idf_scale\"]).mean().sort_values([\"adj_rand\"], ascending=False)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Unnamed: 0</th>\n", " <th>history_scale</th>\n", " <th>domain_scale</th>\n", " <th>title_embedding_scale</th>\n", " <th>dbscan_eps</th>\n", " <th>remap</th>\n", " <th>rand</th>\n", " <th>adj_rand</th>\n", " </tr>\n", " <tr>\n", " <th>clustering_method</th>\n", " <th>tf_idf_scale</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"4\" valign=\"top\">kmeans</th>\n", " <th>0.2</th>\n", " <td>1322.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.735445</td>\n", " <td>0.277878</td>\n", " </tr>\n", " <tr>\n", " <th>0.0</th>\n", " <td>1320.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.730139</td>\n", " <td>0.277629</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <td>1323.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.727076</td>\n", " <td>0.276215</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <td>1321.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " 
<td>1.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.727747</td>\n", " <td>0.271124</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"4\" valign=\"top\">dbscan</th>\n", " <th>0.0</th>\n", " <td>1344.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.687786</td>\n", " <td>0.255828</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <td>1345.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.686185</td>\n", " <td>0.255206</td>\n", " </tr>\n", " <tr>\n", " <th>0.3</th>\n", " <td>1347.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.676580</td>\n", " <td>0.249847</td>\n", " </tr>\n", " <tr>\n", " <th>0.2</th>\n", " <td>1346.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.681162</td>\n", " <td>0.249232</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Unnamed: 0 history_scale domain_scale \\\n", "clustering_method tf_idf_scale \n", "kmeans 0.2 1322.0 0.0 0.0 \n", " 0.0 1320.0 0.0 0.0 \n", " 0.3 1323.0 0.0 0.0 \n", " 0.1 1321.0 0.0 0.0 \n", "dbscan 0.0 1344.0 0.0 0.0 \n", " 0.1 1345.0 0.0 0.0 \n", " 0.3 1347.0 0.0 0.0 \n", " 0.2 1346.0 0.0 0.0 \n", "\n", " title_embedding_scale dbscan_eps remap \\\n", "clustering_method tf_idf_scale \n", "kmeans 0.2 1.0 0.4 0.0 \n", " 0.0 1.0 0.4 0.0 \n", " 0.3 1.0 0.4 0.0 \n", " 0.1 1.0 0.4 0.0 \n", "dbscan 0.0 1.0 0.4 0.0 \n", " 0.1 1.0 0.4 0.0 \n", " 0.3 1.0 0.4 0.0 \n", " 0.2 1.0 0.4 0.0 \n", "\n", " rand adj_rand \n", "clustering_method tf_idf_scale \n", "kmeans 0.2 0.735445 0.277878 \n", " 0.0 0.730139 0.277629 \n", " 0.3 0.727076 0.276215 \n", " 0.1 0.727747 0.271124 \n", "dbscan 0.0 0.687786 0.255828 \n", " 0.1 0.686185 0.255206 \n", " 0.3 0.676580 0.249847 \n", " 0.2 0.681162 0.249232 " ] }, "execution_count": 11, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "all_pipeline[(all_pipeline.remap == 0) & (all_pipeline.dbscan_eps == 0.4)].groupby([\"clustering_method\", \"tf_idf_scale\"]).mean(numeric_only=True).sort_values([\"adj_rand\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": 12, "id": "f3b8aa85-5fd9-4caf-b274-032a9986354e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_31550/2590128475.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n", " all_pipeline[(all_pipeline.remap == 0) & (all_pipeline.dbscan_eps == 0.4)].groupby([\"clustering_method\", \"tf_idf_scale\"]).mean().sort_values([\"adj_rand\"], ascending=False).to_csv(\"dbscan_vs_kmeans.csv\")\n" ] } ], "source": [ "all_pipeline[(all_pipeline.remap == 0) & (all_pipeline.dbscan_eps == 0.4)].groupby([\"clustering_method\", \"tf_idf_scale\"]).mean(numeric_only=True).sort_values([\"adj_rand\"], ascending=False).to_csv(\"dbscan_vs_kmeans.csv\")" ] }, { "cell_type": "code", "execution_count": 13, "id": "4c6aafa1-570a-439c-aa75-979cbe7cd73e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/Rrando/Documents/GitHub/content-ml-experiments/tab_grouping/analysis\n" ] } ], "source": [ "!pwd" ] }, { "cell_type": "code", "execution_count": 14, "id": "cb4810d6-7956-404d-bd29-086b2ab3920c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Unnamed: 0', 'history_scale', 'domain_scale', 'title_embedding_scale',\n", " 'tf_idf_scale', 'clustering_method', 'dbscan_eps', 'remap',\n", " 'num_cluster_method', 'text_for_embedding', 'embedding_model',\n", " 'dataset', 'rand', 'adj_rand'],\n", " dtype='object')" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"all_pipeline.columns" ] }, { "cell_type": "code", "execution_count": 16, "id": "ee71e690-7157-43f4-99aa-2699c00ca637", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_31550/143489395.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n", " all_pipeline.groupby(\"embedding_model\").mean().sort_values([\"adj_rand\"], ascending=False)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Unnamed: 0</th>\n", " <th>history_scale</th>\n", " <th>domain_scale</th>\n", " <th>title_embedding_scale</th>\n", " <th>tf_idf_scale</th>\n", " <th>dbscan_eps</th>\n", " <th>remap</th>\n", " <th>rand</th>\n", " <th>adj_rand</th>\n", " </tr>\n", " <tr>\n", " <th>embedding_model</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>all-mpnet-base-v2</th>\n", " <td>1367.5</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.15</td>\n", " <td>0.4</td>\n", " <td>6.666667</td>\n", " <td>0.735767</td>\n", " <td>0.313608</td>\n", " </tr>\n", " <tr>\n", " <th>all-MiniLM-L6-v2</th>\n", " <td>1319.5</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.15</td>\n", " <td>0.4</td>\n", " <td>6.666667</td>\n", " <td>0.732232</td>\n", " 
<td>0.282686</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Unnamed: 0 history_scale domain_scale \\\n", "embedding_model \n", "all-mpnet-base-v2 1367.5 0.0 0.0 \n", "all-MiniLM-L6-v2 1319.5 0.0 0.0 \n", "\n", " title_embedding_scale tf_idf_scale dbscan_eps remap \\\n", "embedding_model \n", "all-mpnet-base-v2 1.0 0.15 0.4 6.666667 \n", "all-MiniLM-L6-v2 1.0 0.15 0.4 6.666667 \n", "\n", " rand adj_rand \n", "embedding_model \n", "all-mpnet-base-v2 0.735767 0.313608 \n", "all-MiniLM-L6-v2 0.732232 0.282686 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_pipeline.groupby(\"embedding_model\").mean(numeric_only=True).sort_values([\"adj_rand\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": 20, "id": "b55976fc-762c-4733-8585-49b723e0fb82", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_31550/1002988956.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. 
Either specify numeric_only or select only columns which should be valid for the function.\n", " all_pipeline[(all_pipeline.remap == 5) & (all_pipeline.dbscan_eps == 0.4)].groupby([\"embedding_model\", \"clustering_method\", \"tf_idf_scale\"]).mean().sort_values([\"adj_rand\"], ascending=False)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th>Unnamed: 0</th>\n", " <th>history_scale</th>\n", " <th>domain_scale</th>\n", " <th>title_embedding_scale</th>\n", " <th>dbscan_eps</th>\n", " <th>remap</th>\n", " <th>rand</th>\n", " <th>adj_rand</th>\n", " </tr>\n", " <tr>\n", " <th>embedding_model</th>\n", " <th>clustering_method</th>\n", " <th>tf_idf_scale</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"4\" valign=\"top\">all-mpnet-base-v2</th>\n", " <th rowspan=\"2\" valign=\"top\">dbscan</th>\n", " <th>0.3</th>\n", " <td>1375.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.803388</td>\n", " <td>0.378215</td>\n", " </tr>\n", " <tr>\n", " <th>0.0</th>\n", " <td>1372.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.806528</td>\n", " <td>0.361670</td>\n", " </tr>\n", " <tr>\n", " <th>kmeans</th>\n", " <th>0.3</th>\n", " <td>1351.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.738482</td>\n", " <td>0.350927</td>\n", " </tr>\n", " 
<tr>\n", " <th>dbscan</th>\n", " <th>0.1</th>\n", " <td>1373.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.785225</td>\n", " <td>0.347258</td>\n", " </tr>\n", " <tr>\n", " <th>all-MiniLM-L6-v2</th>\n", " <th>dbscan</th>\n", " <th>0.3</th>\n", " <td>1327.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.786157</td>\n", " <td>0.339895</td>\n", " </tr>\n", " <tr>\n", " <th>all-mpnet-base-v2</th>\n", " <th>dbscan</th>\n", " <th>0.2</th>\n", " <td>1374.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.791262</td>\n", " <td>0.338977</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">all-MiniLM-L6-v2</th>\n", " <th>kmeans</th>\n", " <th>0.2</th>\n", " <td>1302.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.738353</td>\n", " <td>0.335513</td>\n", " </tr>\n", " <tr>\n", " <th>dbscan</th>\n", " <th>0.0</th>\n", " <td>1324.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.782228</td>\n", " <td>0.334127</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">all-mpnet-base-v2</th>\n", " <th rowspan=\"2\" valign=\"top\">kmeans</th>\n", " <th>0.2</th>\n", " <td>1350.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.726705</td>\n", " <td>0.325811</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <td>1349.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.723921</td>\n", " <td>0.324468</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"4\" valign=\"top\">all-MiniLM-L6-v2</th>\n", " <th rowspan=\"2\" valign=\"top\">dbscan</th>\n", " <th>0.2</th>\n", " <td>1326.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " 
<td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.780287</td>\n", " <td>0.322664</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <td>1325.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.780699</td>\n", " <td>0.316355</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">kmeans</th>\n", " <th>0.0</th>\n", " <td>1300.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.725636</td>\n", " <td>0.302768</td>\n", " </tr>\n", " <tr>\n", " <th>0.1</th>\n", " <td>1301.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.724745</td>\n", " <td>0.300673</td>\n", " </tr>\n", " <tr>\n", " <th>all-mpnet-base-v2</th>\n", " <th>kmeans</th>\n", " <th>0.0</th>\n", " <td>1348.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.700976</td>\n", " <td>0.296830</td>\n", " </tr>\n", " <tr>\n", " <th>all-MiniLM-L6-v2</th>\n", " <th>kmeans</th>\n", " <th>0.3</th>\n", " <td>1303.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.4</td>\n", " <td>5.0</td>\n", " <td>0.715058</td>\n", " <td>0.286528</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Unnamed: 0 history_scale \\\n", "embedding_model clustering_method tf_idf_scale \n", "all-mpnet-base-v2 dbscan 0.3 1375.0 0.0 \n", " 0.0 1372.0 0.0 \n", " kmeans 0.3 1351.0 0.0 \n", " dbscan 0.1 1373.0 0.0 \n", "all-MiniLM-L6-v2 dbscan 0.3 1327.0 0.0 \n", "all-mpnet-base-v2 dbscan 0.2 1374.0 0.0 \n", "all-MiniLM-L6-v2 kmeans 0.2 1302.0 0.0 \n", " dbscan 0.0 1324.0 0.0 \n", "all-mpnet-base-v2 kmeans 0.2 1350.0 0.0 \n", " 0.1 1349.0 0.0 \n", "all-MiniLM-L6-v2 dbscan 0.2 1326.0 0.0 \n", " 0.1 1325.0 0.0 \n", " kmeans 0.0 1300.0 0.0 \n", " 0.1 1301.0 0.0 \n", "all-mpnet-base-v2 kmeans 0.0 1348.0 0.0 \n", "all-MiniLM-L6-v2 
kmeans 0.3 1303.0 0.0 \n", "\n", " domain_scale \\\n", "embedding_model clustering_method tf_idf_scale \n", "all-mpnet-base-v2 dbscan 0.3 0.0 \n", " 0.0 0.0 \n", " kmeans 0.3 0.0 \n", " dbscan 0.1 0.0 \n", "all-MiniLM-L6-v2 dbscan 0.3 0.0 \n", "all-mpnet-base-v2 dbscan 0.2 0.0 \n", "all-MiniLM-L6-v2 kmeans 0.2 0.0 \n", " dbscan 0.0 0.0 \n", "all-mpnet-base-v2 kmeans 0.2 0.0 \n", " 0.1 0.0 \n", "all-MiniLM-L6-v2 dbscan 0.2 0.0 \n", " 0.1 0.0 \n", " kmeans 0.0 0.0 \n", " 0.1 0.0 \n", "all-mpnet-base-v2 kmeans 0.0 0.0 \n", "all-MiniLM-L6-v2 kmeans 0.3 0.0 \n", "\n", " title_embedding_scale \\\n", "embedding_model clustering_method tf_idf_scale \n", "all-mpnet-base-v2 dbscan 0.3 1.0 \n", " 0.0 1.0 \n", " kmeans 0.3 1.0 \n", " dbscan 0.1 1.0 \n", "all-MiniLM-L6-v2 dbscan 0.3 1.0 \n", "all-mpnet-base-v2 dbscan 0.2 1.0 \n", "all-MiniLM-L6-v2 kmeans 0.2 1.0 \n", " dbscan 0.0 1.0 \n", "all-mpnet-base-v2 kmeans 0.2 1.0 \n", " 0.1 1.0 \n", "all-MiniLM-L6-v2 dbscan 0.2 1.0 \n", " 0.1 1.0 \n", " kmeans 0.0 1.0 \n", " 0.1 1.0 \n", "all-mpnet-base-v2 kmeans 0.0 1.0 \n", "all-MiniLM-L6-v2 kmeans 0.3 1.0 \n", "\n", " dbscan_eps remap rand \\\n", "embedding_model clustering_method tf_idf_scale \n", "all-mpnet-base-v2 dbscan 0.3 0.4 5.0 0.803388 \n", " 0.0 0.4 5.0 0.806528 \n", " kmeans 0.3 0.4 5.0 0.738482 \n", " dbscan 0.1 0.4 5.0 0.785225 \n", "all-MiniLM-L6-v2 dbscan 0.3 0.4 5.0 0.786157 \n", "all-mpnet-base-v2 dbscan 0.2 0.4 5.0 0.791262 \n", "all-MiniLM-L6-v2 kmeans 0.2 0.4 5.0 0.738353 \n", " dbscan 0.0 0.4 5.0 0.782228 \n", "all-mpnet-base-v2 kmeans 0.2 0.4 5.0 0.726705 \n", " 0.1 0.4 5.0 0.723921 \n", "all-MiniLM-L6-v2 dbscan 0.2 0.4 5.0 0.780287 \n", " 0.1 0.4 5.0 0.780699 \n", " kmeans 0.0 0.4 5.0 0.725636 \n", " 0.1 0.4 5.0 0.724745 \n", "all-mpnet-base-v2 kmeans 0.0 0.4 5.0 0.700976 \n", "all-MiniLM-L6-v2 kmeans 0.3 0.4 5.0 0.715058 \n", "\n", " adj_rand \n", "embedding_model clustering_method tf_idf_scale \n", "all-mpnet-base-v2 dbscan 0.3 0.378215 \n", " 0.0 0.361670 
\n", " kmeans 0.3 0.350927 \n", " dbscan 0.1 0.347258 \n", "all-MiniLM-L6-v2 dbscan 0.3 0.339895 \n", "all-mpnet-base-v2 dbscan 0.2 0.338977 \n", "all-MiniLM-L6-v2 kmeans 0.2 0.335513 \n", " dbscan 0.0 0.334127 \n", "all-mpnet-base-v2 kmeans 0.2 0.325811 \n", " 0.1 0.324468 \n", "all-MiniLM-L6-v2 dbscan 0.2 0.322664 \n", " 0.1 0.316355 \n", " kmeans 0.0 0.302768 \n", " 0.1 0.300673 \n", "all-mpnet-base-v2 kmeans 0.0 0.296830 \n", "all-MiniLM-L6-v2 kmeans 0.3 0.286528 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_pipeline[(all_pipeline.remap == 5) & (all_pipeline.dbscan_eps == 0.4)].groupby([\"embedding_model\", \"clustering_method\", \"tf_idf_scale\"]).mean(numeric_only=True).sort_values([\"adj_rand\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": 5, "id": "38453d81-a134-4673-afe2-36894f165600", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/Rrando/Documents/GitHub/content-ml-experiments/tab_grouping/analysis\n" ] } ], "source": [ "!pwd" ] }, { "cell_type": "code", "execution_count": 2, "id": "819409e2-4374-43ee-94fc-2702cd1eca60", "metadata": {}, "outputs": [], "source": [ "embed_test = pd.read_csv(\"../output/all_pipeline_embedding_test.csv\", index_col=False)" ] }, { "cell_type": "code", "execution_count": 15, "id": "8a90efd0-6dca-46e6-92d1-4efe0585617b", "metadata": {}, "outputs": [], "source": [ "embed_test.loc[embed_test.clustering_method == \"dbscan\", 'num_cluster_method'] = \"none\"" ] }, { "cell_type": "code", "execution_count": 17, "id": "a3a51484-4168-46e1-ad82-046084241cce", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/jq/lt0dyzf14y93d8k18k68_4fm0000gn/T/ipykernel_2593/190544645.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. 
Either specify numeric_only or select only columns which should be valid for the function.\n", " embed_test[(embed_test.remap == 0) & (embed_test.dbscan_eps == 0.4)].groupby([\"embedding_model\", \"clustering_method\", \"num_cluster_method\"]).mean().sort_values([\"adj_rand\"], ascending=False)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th>Unnamed: 0</th>\n", " <th>history_scale</th>\n", " <th>domain_scale</th>\n", " <th>title_embedding_scale</th>\n", " <th>tf_idf_scale</th>\n", " <th>dbscan_eps</th>\n", " <th>remap</th>\n", " <th>rand</th>\n", " <th>adj_rand</th>\n", " </tr>\n", " <tr>\n", " <th>embedding_model</th>\n", " <th>clustering_method</th>\n", " <th>num_cluster_method</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">all-mpnet-base-v2</th>\n", " <th>kmeans</th>\n", " <th>knee</th>\n", " <td>504.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.726403</td>\n", " <td>0.290479</td>\n", " </tr>\n", " <tr>\n", " <th>dbscan</th>\n", " <th>none</th>\n", " <td>514.5</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.699151</td>\n", " <td>0.270600</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">all-MiniLM-L6-v2</th>\n", " <th rowspan=\"2\" valign=\"top\">kmeans</th>\n", " <th>silhouette</th>\n", " 
<td>489.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.754944</td>\n", " <td>0.265861</td>\n", " </tr>\n", " <tr>\n", " <th>knee</th>\n", " <td>486.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.728406</td>\n", " <td>0.254920</td>\n", " </tr>\n", " <tr>\n", " <th>all-mpnet-base-v2</th>\n", " <th>kmeans</th>\n", " <th>silhouette</th>\n", " <td>507.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.726056</td>\n", " <td>0.253460</td>\n", " </tr>\n", " <tr>\n", " <th>all-MiniLM-L6-v2</th>\n", " <th>dbscan</th>\n", " <th>none</th>\n", " <td>496.5</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.4</td>\n", " <td>0.0</td>\n", " <td>0.675739</td>\n", " <td>0.240188</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Unnamed: 0 \\\n", "embedding_model clustering_method num_cluster_method \n", "all-mpnet-base-v2 kmeans knee 504.0 \n", " dbscan none 514.5 \n", "all-MiniLM-L6-v2 kmeans silhouette 489.0 \n", " knee 486.0 \n", "all-mpnet-base-v2 kmeans silhouette 507.0 \n", "all-MiniLM-L6-v2 dbscan none 496.5 \n", "\n", " history_scale \\\n", "embedding_model clustering_method num_cluster_method \n", "all-mpnet-base-v2 kmeans knee 0.0 \n", " dbscan none 0.0 \n", "all-MiniLM-L6-v2 kmeans silhouette 0.0 \n", " knee 0.0 \n", "all-mpnet-base-v2 kmeans silhouette 0.0 \n", "all-MiniLM-L6-v2 dbscan none 0.0 \n", "\n", " domain_scale \\\n", "embedding_model clustering_method num_cluster_method \n", "all-mpnet-base-v2 kmeans knee 0.0 \n", " dbscan none 0.0 \n", "all-MiniLM-L6-v2 kmeans silhouette 0.0 \n", " knee 0.0 \n", "all-mpnet-base-v2 kmeans silhouette 0.0 \n", "all-MiniLM-L6-v2 dbscan none 0.0 \n", "\n", " title_embedding_scale \\\n", "embedding_model 
clustering_method num_cluster_method \n", "all-mpnet-base-v2 kmeans knee 1.0 \n", " dbscan none 1.0 \n", "all-MiniLM-L6-v2 kmeans silhouette 1.0 \n", " knee 1.0 \n", "all-mpnet-base-v2 kmeans silhouette 1.0 \n", "all-MiniLM-L6-v2 dbscan none 1.0 \n", "\n", " tf_idf_scale \\\n", "embedding_model clustering_method num_cluster_method \n", "all-mpnet-base-v2 kmeans knee 0.0 \n", " dbscan none 0.0 \n", "all-MiniLM-L6-v2 kmeans silhouette 0.0 \n", " knee 0.0 \n", "all-mpnet-base-v2 kmeans silhouette 0.0 \n", "all-MiniLM-L6-v2 dbscan none 0.0 \n", "\n", " dbscan_eps remap \\\n", "embedding_model clustering_method num_cluster_method \n", "all-mpnet-base-v2 kmeans knee 0.4 0.0 \n", " dbscan none 0.4 0.0 \n", "all-MiniLM-L6-v2 kmeans silhouette 0.4 0.0 \n", " knee 0.4 0.0 \n", "all-mpnet-base-v2 kmeans silhouette 0.4 0.0 \n", "all-MiniLM-L6-v2 dbscan none 0.4 0.0 \n", "\n", " rand adj_rand \n", "embedding_model clustering_method num_cluster_method \n", "all-mpnet-base-v2 kmeans knee 0.726403 0.290479 \n", " dbscan none 0.699151 0.270600 \n", "all-MiniLM-L6-v2 kmeans silhouette 0.754944 0.265861 \n", " knee 0.728406 0.254920 \n", "all-mpnet-base-v2 kmeans silhouette 0.726056 0.253460 \n", "all-MiniLM-L6-v2 dbscan none 0.675739 0.240188 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "embed_test[(embed_test.remap == 0) & (embed_test.dbscan_eps == 0.4)].groupby([\"embedding_model\", \"clustering_method\", \"num_cluster_method\"]).mean(numeric_only=True).sort_values([\"adj_rand\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "3856af8a-3b1f-4d80-8ba5-8bec2b86e76d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "b9a76ce0-19ce-4fba-bb41-adae55ea7c6a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a0ab0fe6-7884-4c2c-8273-7074d7b2f77e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": 
{ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }

notebooks/Analyze Clusters.ipynb (2,639 lines of code) (raw):